{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re, sys, nltk\n",
    "import _pickle as pickle\n",
    "from nltk.tokenize.stanford import StanfordTokenizer\n",
    "\n",
    "# Location of the Stanford POS-tagger jar handed to StanfordTokenizer.\n",
    "# The default is a machine-specific absolute path; set the STANFORD_POSTAGGER_JAR\n",
    "# environment variable to point at the jar when running on another machine.\n",
    "path_to_jar = os.environ.get(\"STANFORD_POSTAGGER_JAR\", \"/home/shanu/nltk/jars/stanford-postagger.jar\")\n",
    "tokenizer = StanfordTokenizer(path_to_jar)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Extract the relation labels and write them to a text file, one label per line.\n",
    "# Comment this cell out when only preprocessing the sentences.\n",
    "# For training data open \"TRAIN_FILE.TXT\"; for test data open \"TEST_FILE_FULL.TXT\".\n",
    "\n",
    "with open(\"data/TRAIN_FILE.TXT\") as src:\n",
    "    lines = [line.strip() for line in src]\n",
    "\n",
    "# Keep every fourth line starting at 0-based index 1 (i.e. lines 1, 5, 9, ...).\n",
    "relations = [w for i, w in enumerate(lines) if i % 4 == 1]\n",
    "\n",
    "# The with-block closes the file on exit, which guarantees that all buffered\n",
    "# labels are flushed to disk even though the kernel keeps running afterwards.\n",
    "with open(\"data/train_relations.txt\", 'w') as f:\n",
    "    for rel in relations:\n",
    "        f.write(rel + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the raw sentences to preprocess.\n",
    "# For training data open \"TRAIN_FILE.TXT\"; for test data open \"TEST_FILE.txt\".\n",
    "#\n",
    "# Only lines of the form  <digits><one whitespace char>\"<sentence>\"  are kept;\n",
    "# the leading number and the surrounding double quotes are dropped.\n",
    "\n",
    "sentence_pattern = re.compile(r'^([0-9]+)\\s\"(.+)\"$')\n",
    "\n",
    "lines = []\n",
    "with open(\"data/TRAIN_FILE.TXT\") as src:  # closed automatically on exit\n",
    "    for line in src:\n",
    "        m = sentence_pattern.match(line.strip())\n",
    "        if m is not None:\n",
    "            lines.append(m.group(2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: number of relation labels collected in `relations`.\n",
    "len(relations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Tokenize every sentence and record, for each tagged entity (<e1>...</e1> and\n",
    "# <e2>...</e2>), the token index of its first word.\n",
    "#\n",
    "# Each sentence is tokenized twice: once with the tags simply removed (`text`)\n",
    "# and once with \"E11\"/\"E22\" glued onto the first token of each entity (`temp`).\n",
    "# The entity index is the first position where the plain token contains the\n",
    "# entity's first word and the marked token carries the suffix.\n",
    "#\n",
    "# tokenize() is expensive (NLTK's StanfordTokenizer runs the Java tokenizer on\n",
    "# each call), so every entity is tokenized once and its tokens are reused.\n",
    "\n",
    "sentences = []\n",
    "e1 = []\n",
    "e2 = []\n",
    "for j, line in enumerate(lines):\n",
    "    # Cut the sentence into: before e1 | e1 | between | e2 | after e2.\n",
    "    parts = line.split(\"<e1>\")\n",
    "    before = parts[0]\n",
    "    parts = parts[1].split(\"</e1>\")\n",
    "    e1_words = parts[0]\n",
    "    parts = parts[1].split(\"<e2>\")\n",
    "    between = parts[0]\n",
    "    parts = parts[1].split(\"</e2>\")\n",
    "    e2_words = parts[0]\n",
    "    after = parts[1]\n",
    "\n",
    "    e1_tokens = tokenizer.tokenize(e1_words)\n",
    "    e2_tokens = tokenizer.tokenize(e2_words)\n",
    "    # Remember the first word of each entity before the marker is appended.\n",
    "    q1 = e1_tokens[0]\n",
    "    q2 = e2_tokens[0]\n",
    "    e1_tokens[0] += \"E11\"\n",
    "    e2_tokens[0] += \"E22\"\n",
    "\n",
    "    text = tokenizer.tokenize(\" \".join([before, e1_words, between, e2_words, after]))\n",
    "    temp = tokenizer.tokenize(\n",
    "        \" \".join([before, \" \".join(e1_tokens), between, \" \".join(e2_tokens), after])\n",
    "    )\n",
    "\n",
    "    # Take only the FIRST matching position for each entity. zip() stops at the\n",
    "    # shorter list, so a length difference between the two tokenizations cannot\n",
    "    # raise IndexError.\n",
    "    e1_index = None\n",
    "    e2_index = None\n",
    "    for i, (word, marked) in enumerate(zip(text, temp)):\n",
    "        if e1_index is None and q1 in word and \"E11\" in marked:\n",
    "            e1_index = i\n",
    "        if e2_index is None and q2 in word and \"E22\" in marked:\n",
    "            e2_index = i\n",
    "\n",
    "    # At most one index is appended per sentence. If an entity cannot be located\n",
    "    # nothing is appended (e1/e2 then end up shorter than sentences) and a warning\n",
    "    # names the offending sentence so the mismatch can be traced.\n",
    "    if e1_index is None:\n",
    "        print(\"warning: e1 not found in sentence\", j, file=sys.stderr)\n",
    "    else:\n",
    "        e1.append(e1_index)\n",
    "    if e2_index is None:\n",
    "        print(\"warning: e2 not found in sentence\", j, file=sys.stderr)\n",
    "    else:\n",
    "        e2.append(e2_index)\n",
    "\n",
    "    text = \" \".join(text)\n",
    "    sentences.append(text)\n",
    "    print(j, text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: the three counts should be equal. Differing counts mean that\n",
    "# some sentence did not get exactly one e1 index and one e2 index.\n",
    "len(sentences), len(e1), len(e2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save the preprocessed data as one pickled tuple: (sentences, e1, e2).\n",
    "# For training data write to \"train_data\"; for test data write to \"test_data\".\n",
    "\n",
    "with open('data/train_data', 'wb') as f:\n",
    "    # No explicit f.close() is needed: leaving the with-block closes the file.\n",
    "    pickle.dump((sentences, e1, e2), f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
